<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
<!-- initial-scale=1 pins the starting zoom level alongside the device-width layout. -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#222">
<meta name="generator" content="Hexo 5.4.0">


  <!-- Favicon: the href points at a .jpg file, so the declared MIME type is image/jpeg.
       NOTE(review): sizes="16x16" is kept as-is; confirm it matches the real image. -->
  <link rel="icon" type="image/jpeg" sizes="16x16" href="https://cdn.jsdelivr.net/gh/CNhuazhu/Image/avatar.jpg">

<link rel="stylesheet" href="/next/css/main.css">



<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@5.15.3/css/all.min.css" integrity="sha256-2H3fkXt6FEmrReK448mDVGKb3WW2ZZw35gI7vqHOE4Y=" crossorigin="anonymous">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/animate.css@3.1.1/animate.min.css" integrity="sha256-PR7ttpcvz8qrF57fur/yAx1qXMFJeJFiA6pSzWi0OIE=" crossorigin="anonymous">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@3.5.7/dist/jquery.fancybox.min.css" integrity="sha256-Vzbj7sDDS/woiFS3uNKo8eIuni59rjyNGtXfstRzStA=" crossorigin="anonymous">

<script class="next-config" data-name="main" type="application/json">{"hostname":"cnhuazhu.gitee.io","root":"/next/","images":"/next/images","scheme":"Mist","version":"8.6.1","exturl":false,"sidebar":{"position":"left","Muse | Mist":320,"display":"post","padding":18,"offset":12},"copycode":true,"bookmark":{"enable":false,"color":"#222","save":"auto"},"fancybox":true,"mediumzoom":true,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"motion":{"enable":true,"async":false,"transition":{"post_block":"fadeIn","post_header":"fadeInDown","post_body":"fadeInDown","coll_header":"fadeInLeft","sidebar":"fadeInUp"}},"prism":false,"i18n":{"placeholder":"搜索...","empty":"没有找到任何搜索结果：${query}","hits_time":"找到 ${hits} 个搜索结果（用时 ${time} 毫秒）","hits":"找到 ${hits} 个搜索结果"},"path":"/next/search.xml","localsearch":{"enable":true,"trigger":"auto","top_n_per_article":1,"unescape":false,"preload":false}}</script><script src="/next/js/config.js"></script>
<meta name="description" content="简单介绍Scrapy框架的基本使用以及遇到的问题">
<meta property="og:type" content="article">
<meta property="og:title" content="Scrapy框架（一）：基本使用">
<meta property="og:url" content="https://cnhuazhu.gitee.io/next/2021/01/28/Scrapy%E6%A1%86%E6%9E%B6/Scrapy%E6%A1%86%E6%9E%B6%EF%BC%88%E4%B8%80%EF%BC%89%EF%BC%9A%E5%9F%BA%E6%9C%AC%E4%BD%BF%E7%94%A8/index.html">
<meta property="og:site_name" content="花猪のBlog">
<meta property="og:description" content="简单介绍Scrapy框架的基本使用以及遇到的问题">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="https://cdn.jsdelivr.net/gh/CNhuazhu/Image/twisted.png">
<meta property="og:image" content="https://cdn.jsdelivr.net/gh/CNhuazhu/Image/tree.png">
<meta property="article:published_time" content="2021-01-28T06:11:46.000Z">
<meta property="article:modified_time" content="2021-02-19T13:44:46.900Z">
<meta property="article:author" content="花猪">
<meta property="article:tag" content="Scrapy">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="https://cdn.jsdelivr.net/gh/CNhuazhu/Image/twisted.png">


<link rel="canonical" href="https://cnhuazhu.gitee.io/next/2021/01/28/Scrapy%E6%A1%86%E6%9E%B6/Scrapy%E6%A1%86%E6%9E%B6%EF%BC%88%E4%B8%80%EF%BC%89%EF%BC%9A%E5%9F%BA%E6%9C%AC%E4%BD%BF%E7%94%A8/">



<script class="next-config" data-name="page" type="application/json">{"sidebar":"","isHome":false,"isPost":true,"lang":"zh-CN","comments":true,"permalink":"https://cnhuazhu.gitee.io/next/2021/01/28/Scrapy%E6%A1%86%E6%9E%B6/Scrapy%E6%A1%86%E6%9E%B6%EF%BC%88%E4%B8%80%EF%BC%89%EF%BC%9A%E5%9F%BA%E6%9C%AC%E4%BD%BF%E7%94%A8/","path":"2021/01/28/Scrapy框架/Scrapy框架（一）：基本使用/","title":"Scrapy框架（一）：基本使用"}</script>

<script class="next-config" data-name="calendar" type="application/json">""</script>
<title>Scrapy框架（一）：基本使用 | 花猪のBlog</title>
  




  <noscript>
    <link rel="stylesheet" href="/next/css/noscript.css">
  </noscript>
</head>

<body itemscope itemtype="http://schema.org/WebPage" class="use-motion">
  <div class="headband"></div>

  <main class="main">
    <header class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="切换导航栏" role="button">
        <span class="toggle-line"></span>
        <span class="toggle-line"></span>
        <span class="toggle-line"></span>
    </div>
  </div>

  <div class="site-meta">

    <a href="/next/" class="brand" rel="start">
      <i class="logo-line"></i>
      <h1 class="site-title">花猪のBlog</h1>
      <i class="logo-line"></i>
    </a>
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger">
        <i class="fa fa-search fa-fw fa-lg"></i>
    </div>
  </div>
</div>



<nav class="site-nav">
  <ul class="main-menu menu">
        <li class="menu-item menu-item-主站"><a href="https://cnhuazhu.top/" rel="noopener" target="_blank"><i class="fas fa-chevron-right fa-fw"></i>主站</a></li>
        <li class="menu-item menu-item-home"><a href="/next/" rel="section"><i class="fa fa-home fa-fw"></i>首页</a></li>
        <li class="menu-item menu-item-tags"><a href="/next/tags/" rel="section"><i class="fa fa-tags fa-fw"></i>标签<span class="badge">14</span></a></li>
        <li class="menu-item menu-item-categories"><a href="/next/categories/" rel="section"><i class="fa fa-th fa-fw"></i>分类<span class="badge">8</span></a></li>
        <li class="menu-item menu-item-archives"><a href="/next/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>归档<span class="badge">53</span></a></li>
      <li class="menu-item menu-item-search">
        <a role="button" class="popup-trigger"><i class="fa fa-search fa-fw"></i>搜索
        </a>
      </li>
  </ul>
</nav>



  <div class="search-pop-overlay">
    <div class="popup search-popup"><div class="search-header">
  <!-- Search popup header: decorative icon, query field, close control.
       Icons are hidden from assistive technology; the field and the close
       control get explicit accessible names because neither has visible text. -->
  <span class="search-icon">
    <i class="fa fa-search" aria-hidden="true"></i>
  </span>
  <div class="search-input-container">
    <input autocomplete="off" autocapitalize="off" maxlength="80"
           placeholder="搜索..." spellcheck="false"
           type="search" class="search-input" aria-label="搜索">
  </div>
  <span class="popup-btn-close" role="button" aria-label="关闭">
    <i class="fa fa-times-circle" aria-hidden="true"></i>
  </span>
</div>
<div class="search-result-container no-result">
  <div class="search-result-icon">
    <i class="fa fa-spinner fa-pulse fa-5x"></i>
  </div>
</div>

    </div>
  </div>

</div>
        
  
  <div class="toggle sidebar-toggle" role="button">
    <span class="toggle-line"></span>
    <span class="toggle-line"></span>
    <span class="toggle-line"></span>
  </div>

  <aside class="sidebar">

    <div class="sidebar-inner sidebar-nav-active sidebar-toc-active">
      <ul class="sidebar-nav">
        <li class="sidebar-nav-toc">
          文章目录
        </li>
        <li class="sidebar-nav-overview">
          站点概览
        </li>
      </ul>

      <div class="sidebar-panel-container">
        <!--noindex-->
        <div class="post-toc-wrap sidebar-panel">
            <div class="post-toc animated"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#%E5%89%8D%E8%A8%80"><span class="nav-number">1.</span> <span class="nav-text">前言</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#Scrapy%E6%A1%86%E6%9E%B6%E7%9A%84%E5%9F%BA%E6%9C%AC%E4%BD%BF%E7%94%A8"><span class="nav-number">2.</span> <span class="nav-text">Scrapy框架的基本使用</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#%E7%8E%AF%E5%A2%83%E7%9A%84%E5%AE%89%E8%A3%85"><span class="nav-number">2.1.</span> <span class="nav-text">环境的安装</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E5%88%9B%E5%BB%BAscrapy%E5%B7%A5%E7%A8%8B"><span class="nav-number">2.2.</span> <span class="nav-text">创建scrapy工程</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E6%96%87%E4%BB%B6%E5%8F%82%E6%95%B0%E7%9A%84%E4%BF%AE%E6%94%B9"><span class="nav-number">2.3.</span> <span class="nav-text">文件参数的修改</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E5%8F%AF%E8%83%BD%E9%81%87%E5%88%B0%E7%9A%84%E9%97%AE%E9%A2%98"><span class="nav-number">3.</span> <span class="nav-text">可能遇到的问题</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E7%BB%93%E5%B0%BE"><span class="nav-number">4.</span> <span class="nav-text">结尾</span></a></li></ol></div>
        </div>
        <!--/noindex-->

        <div class="site-overview-wrap sidebar-panel">
          <div class="site-overview">
            <div class="site-author site-overview-item animated" itemprop="author" itemscope itemtype="http://schema.org/Person">
    <img class="site-author-image" itemprop="image" alt="花猪"
      src="https://cdn.jsdelivr.net/gh/CNhuazhu/Image/avatar.jpg">
  <p class="site-author-name" itemprop="name">花猪</p>
  <div class="site-description" itemprop="description">为了获得不同的阅读体验，建立此分站。<br>（可在导航栏搜索想要查看的文章）<br><br>本站主题：NEXT 8.6.1</div>
</div>
<div class="site-state-wrap site-overview-item animated">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
          <a href="/next/archives/">
        
          <span class="site-state-item-count">53</span>
          <span class="site-state-item-name">日志</span>
        </a>
      </div>
      <div class="site-state-item site-state-categories">
            <a href="/next/categories/">
          
        <span class="site-state-item-count">8</span>
        <span class="site-state-item-name">分类</span></a>
      </div>
      <div class="site-state-item site-state-tags">
            <a href="/next/tags/">
          
        <span class="site-state-item-count">14</span>
        <span class="site-state-item-name">标签</span></a>
      </div>
  </nav>
</div>
  <div class="links-of-author site-overview-item animated">
      <span class="links-of-author-item">
        <a href="https://github.com/CNhuazhu" title="GitHub → https:&#x2F;&#x2F;github.com&#x2F;CNhuazhu" rel="noopener" target="_blank"><i class="fab fa-github fa-fw"></i>GitHub</a>
      </span>
  </div>



          </div>
        </div>
      </div>
    </div>
  </aside>
  <div class="sidebar-dimmer"></div>


    </header>

    
  <div class="back-to-top" role="button" aria-label="返回顶部">
    <i class="fa fa-arrow-up"></i>
    <span>0%</span>
  </div>
  <div class="reading-progress-bar"></div>

  <a href="https://github.com/CNhuazhu" class="github-corner" title="Follow me on GitHub" aria-label="Follow me on GitHub" rel="noopener" target="_blank"><svg width="80" height="80" viewBox="0 0 250 250" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a>

<noscript>
  <div class="noscript-warning">Theme NexT works best with JavaScript enabled</div>
</noscript>


    <div class="main-inner post posts-expand">


  


<div class="post-block">
  
  

  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh-CN">
    <link itemprop="mainEntityOfPage" href="https://cnhuazhu.gitee.io/next/2021/01/28/Scrapy%E6%A1%86%E6%9E%B6/Scrapy%E6%A1%86%E6%9E%B6%EF%BC%88%E4%B8%80%EF%BC%89%EF%BC%9A%E5%9F%BA%E6%9C%AC%E4%BD%BF%E7%94%A8/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="https://cdn.jsdelivr.net/gh/CNhuazhu/Image/avatar.jpg">
      <meta itemprop="name" content="花猪">
      <meta itemprop="description" content="为了获得不同的阅读体验，建立此分站。<br>（可在导航栏搜索想要查看的文章）<br><br>本站主题：NEXT 8.6.1">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="花猪のBlog">
    </span>
      <header class="post-header">
        <h1 class="post-title" itemprop="name headline">
          Scrapy框架（一）：基本使用
        </h1>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2021-01-28 14:11:46" itemprop="dateCreated datePublished" datetime="2021-01-28T14:11:46+08:00">2021-01-28</time>
    </span>
      <span class="post-meta-item">
        <span class="post-meta-item-icon">
          <i class="far fa-calendar-check"></i>
        </span>
        <span class="post-meta-item-text">更新于</span>
        <time title="修改时间：2021-02-19 21:44:46" itemprop="dateModified" datetime="2021-02-19T21:44:46+08:00">2021-02-19</time>
      </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">分类于</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/next/categories/Python/" itemprop="url" rel="index"><span itemprop="name">Python</span></a>
        </span>
    </span>

  
    <!-- View-counter placeholders, one classed leancloud_visitors and one classed
         twikoo_visitors. NOTE(review): both outer spans carry the same id value,
         and HTML requires ids to be unique within a document. The counter scripts
         presumably locate these elements by id or class, so confirm against
         lean-analytics.js and Twikoo before renaming either id. -->
    <span id="/next/2021/01/28/Scrapy%E6%A1%86%E6%9E%B6/Scrapy%E6%A1%86%E6%9E%B6%EF%BC%88%E4%B8%80%EF%BC%89%EF%BC%9A%E5%9F%BA%E6%9C%AC%E4%BD%BF%E7%94%A8/" class="post-meta-item leancloud_visitors" data-flag-title="Scrapy框架（一）：基本使用" title="阅读次数">
      <span class="post-meta-item-icon">
        <i class="far fa-eye"></i>
      </span>
      <span class="post-meta-item-text">阅读次数：</span>
      <span class="leancloud-visitors-count"></span>
    </span>
    <span id="/next/2021/01/28/Scrapy%E6%A1%86%E6%9E%B6/Scrapy%E6%A1%86%E6%9E%B6%EF%BC%88%E4%B8%80%EF%BC%89%EF%BC%9A%E5%9F%BA%E6%9C%AC%E4%BD%BF%E7%94%A8/" class="post-meta-item twikoo_visitors" data-flag-title="Scrapy框架（一）：基本使用" title="阅读次数">
      <span class="post-meta-item-icon">
        <i class="far fa-eye"></i>
      </span>
      <span class="post-meta-item-text">阅读次数：</span>
      <span id="twikoo_visitors"></span>
    </span>
  
      </div>
      <div class="post-meta">
    <span class="post-meta-item" title="本文字数">
      <span class="post-meta-item-icon">
        <i class="far fa-file-word"></i>
      </span>
      <span class="post-meta-item-text">本文字数：</span>
      <span>2.1k</span>
    </span>
    <span class="post-meta-item" title="阅读时长">
      <span class="post-meta-item-icon">
        <i class="far fa-clock"></i>
      </span>
      <span class="post-meta-item-text">阅读时长 &asymp;</span>
      <span>2 分钟</span>
    </span>
</div>

            <div class="post-description">简单介绍Scrapy框架的基本使用以及遇到的问题</div>
        </div>
      </header>

    
    
    
    <div class="post-body" itemprop="articleBody">
        <h1 id="前言"><a href="#前言" class="headerlink" title="前言"></a>前言</h1><p>本篇文章简单介绍一下Scrapy框架的基本使用方法，以及在使用过程中遇到的一些问题和解决方案。</p>
<hr>
<h1 id="Scrapy框架的基本使用"><a href="#Scrapy框架的基本使用" class="headerlink" title="Scrapy框架的基本使用"></a>Scrapy框架的基本使用</h1><h2 id="环境的安装"><a href="#环境的安装" class="headerlink" title="环境的安装"></a>环境的安装</h2><p><strong>1.输入下述指令安装<code>wheel</code></strong></p>
<figure class="highlight powershell"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">pip install wheel</span><br></pre></td></tr></table></figure>

<p><strong>2.下载<code>twisted</code></strong></p>
<p>这里提供一个下载链接:<a target="_blank" rel="noopener" href="http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted">http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted</a></p>
<!-- Text alternative added for the screenshot that follows the Twisted download link.
     NOTE(review): the theme's image wrapper may display alt text as a caption; confirm the rendering. -->
<p><img src="https://cdn.jsdelivr.net/gh/CNhuazhu/Image/twisted.png" alt="Twisted 下载页面截图"></p>
<blockquote>
<p>注：这里下载有两点需要注意：</p>
<ol>
<li>要下载与自己python版本相对应的文件，<code>cpxx</code>为版本号。（例如我的python版本为3.8.2，就下载cp38的文件）</li>
<li>根据操作系统位数下载对应文件。32位操作系统下载<code>win32</code>；64位操作系统下载win_amd64。</li>
</ol>
</blockquote>
<p><strong>3.安装<code>twisted</code></strong></p>
<p>在上一步下载好的twisted的目录下输入下面的命令：</p>
<figure class="highlight powershell"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">pip install Twisted<span class="literal">-20</span>.<span class="number">3.0</span><span class="literal">-cp38</span><span class="literal">-cp38</span><span class="literal">-win_amd64</span>.whl</span><br></pre></td></tr></table></figure>

<p><strong>4.输入下述指令安装<code>pywin32</code></strong></p>
<figure class="highlight powershell"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">pip install pywin32</span><br></pre></td></tr></table></figure>

<p><strong>5.输入下述指令安装<code>scrapy</code></strong></p>
<figure class="highlight powershell"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">pip install scrapy</span><br></pre></td></tr></table></figure>

<p><strong>6.测试</strong></p>
<p>在终端里输入<code>scrapy</code>命令，没有报错即表示安装成功。</p>
<h2 id="创建scrapy工程"><a href="#创建scrapy工程" class="headerlink" title="创建scrapy工程"></a>创建scrapy工程</h2><p>这里是在PyCharm中创建的scrapy工程</p>
<p><strong>1.打开<code>Terminal</code>面板，输入下述指令创建一个scrapy工程</strong></p>
<figure class="highlight powershell"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">scrapy startproject ProjectName</span><br></pre></td></tr></table></figure>

<blockquote>
<p><code>ProjectName</code>为项目名称，自己定义。</p>
</blockquote>
<p><strong>2.自动生成如下目录</strong></p>
<!-- Text alternative added for the generated project directory image.
     NOTE(review): the theme's image wrapper may display alt text as a caption; confirm the rendering. -->
<p><img src="https://cdn.jsdelivr.net/gh/CNhuazhu/Image/tree.png" alt="scrapy 工程自动生成的目录结构"></p>
<p><strong>3.创建一个爬虫文件</strong></p>
<p>首先进入刚刚创建的工程目录下：</p>
<figure class="highlight powershell"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line"><span class="built_in">cd</span> ProjectName</span><br></pre></td></tr></table></figure>

<p>然后在spiders子目录中创建一个爬虫文件</p>
<figure class="highlight powershell"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">scrapy genspider spiderName www.xxx.com</span><br></pre></td></tr></table></figure>

<blockquote>
<p><code>spiderName</code>为爬虫文件名称，自己定义。</p>
</blockquote>
<p><strong>4.执行工程</strong></p>
<figure class="highlight powershell"><table><tr><td class="gutter"><pre><span class="line">1</span><br></pre></td><td class="code"><pre><span class="line">scrapy crawl spiderName</span><br></pre></td></tr></table></figure>

<hr>
<h2 id="文件参数的修改"><a href="#文件参数的修改" class="headerlink" title="文件参数的修改"></a>文件参数的修改</h2><p>为了能更好的执行爬虫项目，需要修改一些文件的参数。</p>
<p><strong>1.<code>spiderName.py</code></strong></p>
<p>该爬虫文件的内容如下：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br></pre></td><td class="code"><pre><span class="line"><span class="keyword">import</span> scrapy</span><br><span class="line"></span><br><span class="line"><span class="class"><span class="keyword">class</span> <span class="title">FirstSpider</span>(<span class="params">scrapy.Spider</span>):</span></span><br><span class="line">    <span class="comment"># 爬虫文件的名称：就是爬虫源文件的一个唯一标识</span></span><br><span class="line">    name = <span class="string">&#x27;spiderName&#x27;</span></span><br><span class="line">    <span class="comment"># 允许的域名：用来限定start_urls列表中哪些url可以进行请求发送</span></span><br><span class="line">    allowed_domains = [<span class="string">&#x27;www.baidu.com&#x27;</span>]</span><br><span class="line">    <span class="comment"># 起始的url列表：该列表中存放的url会被scrapy自动进行请求的发送</span></span><br><span class="line">    start_urls = [<span class="string">&#x27;http://www.baidu.com/&#x27;</span>,<span class="string">&#x27;https://www.douban.com&#x27;</span>]</span><br><span class="line"></span><br><span class="line">    <span class="comment"># 用于数据解析：response参数表示的就是请求成功后对应的响应对象</span></span><br><span class="line">    <span class="function"><span class="keyword">def</span> <span class="title">parse</span>(<span class="params">self, response</span>):</span></span><br><span class="line">        <span class="keyword">pass</span></span><br></pre></td></tr></table></figure>

<blockquote>
<p>注：<code>allowed_domains</code>列表用来限定请求的url。一般情况不需要，将其注释掉即可。</p>
</blockquote>
<p><strong>2.<code>settings.py</code></strong></p>
<p>1). ROBOTSTXT_OBEY</p>
<p>找到<code>ROBOTSTXT_OBEY</code>关键字，此处默认参数为<code>True</code>。（即项目默认遵守<code>robots协议</code>）为了项目练习，可以暂时将其改为<code>False</code>。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># Obey robots.txt rules</span></span><br><span class="line">ROBOTSTXT_OBEY = <span class="literal">False</span></span><br></pre></td></tr></table></figure>

<p>2). USER_AGENT</p>
<p>找到<code>USER_AGENT</code>关键字，此处默认注释掉了。修改其内容，以避免UA反爬。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># Crawl responsibly by identifying yourself (and your website) on the user-agent</span></span><br><span class="line">USER_AGENT = <span class="string">&#x27;Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.50&#x27;</span></span><br></pre></td></tr></table></figure>

<p>3). LOG_LEVEL</p>
<p>为了更清晰的查看项目运行结果（项目默认运行结果会打印大量的日志信息），可以手动添加<code>LOG_LEVEL</code>关键字。</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br></pre></td><td class="code"><pre><span class="line"><span class="comment"># 显示指定类型的日志信息</span></span><br><span class="line">LOG_LEVEL = <span class="string">&#x27;ERROR&#x27;</span> <span class="comment"># 只显示错误信息</span></span><br></pre></td></tr></table></figure>

<hr>
<h1 id="可能遇到的问题"><a href="#可能遇到的问题" class="headerlink" title="可能遇到的问题"></a>可能遇到的问题</h1><p><strong>1.成功安装完<code>scrapy</code>，但是在创建爬虫文件后依然显示<code>import scrapy</code>有误。</strong></p>
<p>本人练习时用的环境都是基于Python3.8创建的各种虚拟环境，然而在搭建scrapy项目时<code>pip install scrapy</code>始终报错。</p>
<p>最初手动在官网：<a target="_blank" rel="noopener" href="https://scrapy.org/">https://scrapy.org/</a> 下载scrapy库，然后安装到虚拟环境的<code>site-packages</code>目录下，果然回头看<code>import scrapy</code>显示正常了，程序也可以跑。但是依然打印大量的错误信息，通过PyCharm的<code>Python Interpreter</code>查看并没有<code>Scrapy</code>库在内。</p>
<p>无奈又尝试了一些解决方案，无果……</p>
<p>最后发现<code>Anaconda</code>自带<code>Scrapy</code>库，于是又基于<code>Anaconda</code>创建了一个虚拟环境，完美运行~~~~</p>
<hr>
<h1 id="结尾"><a href="#结尾" class="headerlink" title="结尾"></a>结尾</h1><p><em><strong>好好学习</strong></em></p>

    </div>

    
    
    
      


    <footer class="post-footer">
          <div class="reward-container">
  <div></div>
  <!-- Reward toggle. Explicit type="button" because a button's default type is
       submit, which would submit any form this markup is ever placed inside. -->
  <button type="button">
    赞赏
  </button>
  <div class="post-reward">
      <div>
        <img src="https://cdn.jsdelivr.net/gh/CNhuazhu/Image/wechat.png" alt="花猪 微信">
        <span>微信</span>
      </div>
      <div>
        <img src="https://cdn.jsdelivr.net/gh/CNhuazhu/Image/alipay.jpg" alt="花猪 支付宝">
        <span>支付宝</span>
      </div>

  </div>
</div>

          <div class="post-tags">
              <a href="/next/tags/Scrapy/" rel="tag"><i class="fa fa-tag"></i> Scrapy</a>
          </div>

        

          <div class="post-nav">
            <div class="post-nav-item">
                <a href="/next/2021/01/27/Hexo%E5%86%99%E4%BD%9C/" rel="prev" title="Hexo写作">
                  <i class="fa fa-chevron-left"></i> Hexo写作
                </a>
            </div>
            <div class="post-nav-item">
                <a href="/next/2021/02/06/Python%20OSError-Errno-22-Invalid-argument%20%E9%94%99%E8%AF%AF%E8%A7%A3%E5%86%B3/" rel="next" title="Python OSError: [Errno 22] Invalid argument 错误解决">
                  Python OSError: [Errno 22] Invalid argument 错误解决 <i class="fa fa-chevron-right"></i>
                </a>
            </div>
          </div>
    </footer>
  </article>
</div>






    <div class="comments"><div id="twikoo-comments"></div></div>
</div>
  </main>

  <footer class="footer">
    <div class="footer-inner">


<div class="copyright">
  &copy; 
  <span itemprop="copyrightYear">2021</span>
  <span class="with-love">
    <i class="fa fa-heart"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">花猪</span>
</div>
<div class="wordcount">
  <span class="post-meta-item">
    <span class="post-meta-item-icon">
      <i class="fa fa-chart-line"></i>
    </span>
      <span>站点总字数：</span>
    <span title="站点总字数">356k</span>
  </span>
  <span class="post-meta-item">
    <span class="post-meta-item-icon">
      <i class="fa fa-coffee"></i>
    </span>
      <span>站点阅读时长 &asymp;</span>
    <span title="站点阅读时长">5:23</span>
  </span>
</div>
  <div class="powered-by">由 <a href="https://hexo.io/" rel="noopener" target="_blank">Hexo</a> &amp; <a href="https://theme-next.js.org/mist/" rel="noopener" target="_blank">NexT.Mist</a> 强力驱动
  </div>

    </div>
  </footer>

  
  <script src="https://cdn.jsdelivr.net/npm/animejs@3.2.1/lib/anime.min.js" integrity="sha256-XL2inqUJaslATFnHdJOi9GfQ60on8Wx1C2H8DYiN1xY=" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/npm/@next-theme/pjax@0.5.0/pjax.min.js" integrity="sha256-3NkoLDrmHLTYj7csHIZSr0MHAFTXth7Ua/DDt4MRUAg=" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/npm/jquery@3.6.0/dist/jquery.min.js" integrity="sha256-/xUj+3OJU5yExlq6GSYGSHk7tPXikynS7ogEvDej/m4=" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@3.5.7/dist/jquery.fancybox.min.js" integrity="sha256-yt2kYMy0w8AbtF89WXb2P1rfjcP/HTHLT7097U8Y5b8=" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/npm/medium-zoom@1.0.6/dist/medium-zoom.min.js" integrity="sha256-EdPgYcPk/IIrw7FYeuJQexva49pVRZNmt3LculEr7zM=" crossorigin="anonymous"></script>
<script src="/next/js/comments.js"></script><script src="/next/js/utils.js"></script><script src="/next/js/motion.js"></script><script src="/next/js/schemes/muse.js"></script><script src="/next/js/next-boot.js"></script><script src="/next/js/pjax.js"></script>

  
<script src="/next/js/third-party/search/local-search.js"></script>




  


  <script class="next-config" data-name="leancloud_visitors" type="application/json">{"enable":true,"app_id":"dh8GVAcIwU1TN4zhA5y63iW1-gzGzoHsz","app_key":"hzyAqDKjG4OBGrdPS7mKaOa7","server_url":null,"security":true}</script>
  <script src="/next/js/third-party/statistics/lean-analytics.js"></script>


  

  <script class="next-config" data-name="enableMath" type="application/json">true</script><script class="next-config" data-name="mathjax" type="application/json">{"enable":true,"tags":"none","js":{"url":"https://cdn.jsdelivr.net/npm/mathjax@3.2.0/es5/tex-mml-chtml.js","integrity":"sha256-r+3itOMtGGjap0x+10hu6jW/gZCzxHsoKrOd7gyRSGY="}}</script>
<script src="/next/js/third-party/math/mathjax.js"></script>


<script class="next-config" data-name="twikoo" type="application/json">{"enable":true,"visitor":true,"envId":"hexo-1g41404w9f800e94","el":"#twikoo-comments"}</script>
<script>
// Runs the Twikoo start-up sequence whenever a 'page:loaded' event reaches document.
document.addEventListener('page:loaded', async () => {
  const twikooBundleUrl = 'https://cdn.jsdelivr.net/npm/twikoo/dist/twikoo.all.min.js';

  // Step 1: wait for NexT.utils.loadComments on the selector from the Twikoo config.
  await NexT.utils.loadComments(CONFIG.twikoo.el);

  // Step 2: request the Twikoo bundle, passing the current window.twikoo value
  // as the `condition` option.
  await NexT.utils.getScript(twikooBundleUrl, { condition: window.twikoo });

  // Step 3: hand the whole Twikoo config object to the library.
  twikoo.init(CONFIG.twikoo);
});
</script>

</body>
</html>
